#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName  :基于各栏目json链接获取item_id.py
# @Time      :2024/5/31 
# @Author    :CL
# @email     :1037654919@qq.com
import re
from multiprocessing import Pool

import requests

from util import mongo_manager, proxies  # proxies

# Storage handle for harvested item ids; main() writes to it via insertOne().
# Presumably the 'xuexiqiangguo_item_id' collection in the 'public_data'
# database -- verify against util.mongo_manager.
xuexiqiangguo_item_id = mongo_manager('xuexiqiangguo_item_id', db='public_data')


def get_item_id(url="https://www.xuexi.cn/lgdata/19vhj0omh73.json", timeout=10):
    """Download one xuexi.cn column JSON feed and return its body as text.

    Despite its name, this function does not extract any ids: it only performs
    the HTTP GET and hands the raw response text back to the caller, which is
    responsible for parsing it.

    Args:
        url: Address of the ``lgdata/<feed>.json`` resource to download. The
            default is the feed that ``main`` lists under "zhongyaohuiyi".
            (The original author also noted ``1ahjpjgb4n3`` as an alternative
            feed id.)
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so that a stalled connection raises instead of blocking
            the crawl forever.

    Returns:
        ``response.text`` of the reply. The HTTP status code is not checked,
        so the body of an error page is returned as-is.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Browser-like request headers (Chrome 124 on Linux, per the user-agent).
    headers = {
        "accept": "application/json",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://www.xuexi.cn/89acb6d339cd09d5aaf0c2697b6a3278/9a3668c13f6e303932b5e0e100fc248b.html",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Linux\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }
    cookies = {
        "__UID__": "15344de0-11ef-11ef-816a-d722d43974a4"
    }

    # NOTE: requests appends these to whatever query string `url` already has.
    # The feed URLs used by main() already carry an `_st` value, so the final
    # request URL contains `_st` twice; this is kept as in the original code.
    params = {
        "_st": "28594840",
        "js_v": "1706580237911"
    }
    response = requests.get(
        url,
        headers=headers,
        cookies=cookies,
        params=params,
        timeout=timeout,
    )

    # Progress trace: final URL (with query string) and the response object.
    print(response.url, response)
    return response.text
# JSON feed links borrowed from someone else's code; their original source is
# still unknown. Keys are pinyin labels of the site columns, values are the
# feed URLs. Commented-out entries are columns deliberately excluded for the
# reason given next to them.
JSON_URLS = {
    # Study the New Thought
    # "zhongyaoxinwen": "https://www.xuexi.cn/lgdata/1jscb6pu1n2.json?_st=26095725",
    "zhongyaohuodong": "https://www.xuexi.cn/lgdata/1jpuhp6fn73.json?_st=26095746",
    "zhongyaohuiyi": "https://www.xuexi.cn/lgdata/19vhj0omh73.json?_st=26095747",
    "zhongyaojianghua": "https://www.xuexi.cn/lgdata/132gdqo7l73.json?_st=26095749",
    "zhongyaowenzhang": "https://www.xuexi.cn/lgdata/1ahjpjgb4n3.json?_st=26095750",
    "chuguofangwen": "https://www.xuexi.cn/lgdata/1je1objnh73.json?_st=26095752",
    "zhishipishi": "https://www.xuexi.cn/lgdata/1kvrj9vvv73.json?_st=26095752",
    "handianzhici": "https://www.xuexi.cn/lgdata/17qonfb74n3.json?_st=26095753",
    "xinshidaijishi": "https://www.xuexi.cn/lgdata/1i30sdhg0n3.json?_st=26095754",
    "xuexishipin": "https://www.xuexi.cn/lgdata/1ap1igfgdn2.json?_st=26095755",
    "zonghexinwen": "https://www.xuexi.cn/lgdata/1ajhkle8l72.json?_st=26095756",
    "toutiaoxinwei": "https://www.xuexi.cn/lgdata/1crqb964p71.json?_st=26095757",
    # 19th Party Congress Time
    "shijiudawenxian": "https://www.xuexi.cn/lgdata/11d24l914n4.json?_st=26095831",
    "shijiudabaogao": "https://www.xuexi.cn/lgdata/1a78j52k2n4.json?_st=26095832",
    "shijiujiezhongyangquanhui": "https://www.xuexi.cn/lgdata/1c7mgi6tg74.json?_st=26095835",
    "shijiujiezhongyangjiweiquanhui": "https://www.xuexi.cn/lgdata/1niulj5tbn4.json?_st=26095836",
    "yanshenyuedu": "https://www.xuexi.cn/lgdata/1nf12u57o74.json?_st=26096131",
    "niwenwoda": "https://www.xuexi.cn/lgdata/1jfhm81amn4.json?_st=26096132",
    "xueximianduimian": "https://www.xuexi.cn/lgdata/11gg7rev674.json?_st=26096133",
    # Study Theory
    "xuexililun": "https://www.xuexi.cn/lgdata/u1ght1omn2.json?_st=26096137",
    # Red China
    "yongyuandefengbei": "https://www.xuexi.cn/lgdata/1n544qrtv7c.json?_st=26096145",
    # Videos need to be handled separately by a video crawler
    # "shipinzhuanqu": "https://www.xuexi.cn/lgdata/3jsf4shrl928.json?_st=26096148",
    # Red China - Red Memory
    "changzhengjinianguan": "https://www.xuexi.cn/lgdata/u16ui97tnm.json?_st=26096163",
    "kangzhanjinianguan": "https://www.xuexi.cn/lgdata/1c0be8revnm.json?_st=26096164",
    "jiefangzhanzheng": "https://www.xuexi.cn/lgdata/1bh4bl63fnm.json?_st=26096164",
    "hongselvyou": "https://www.xuexi.cn/lgdata/181gpjpb4nm.json?_st=26096165",
    "hongselvyouluxian": "https://www.xuexi.cn/lgdata/1dil8gmtq7m.json?_st=26096166",
    "lijiedangdaihui": "https://www.xuexi.cn/lgdata/1of97bn6c7m.json?_st=26096168",
    # Red China - Party History Research
    "dangshigushi": "https://www.xuexi.cn/lgdata/1drkih7p27m.json?_st=26096167",
    "dangshizhishi": "https://www.xuexi.cn/lgdata/1k8ffl9m8nm.json?_st=26096170",
    "dangshiyanjiu": "https://www.xuexi.cn/lgdata/1ogogofuqnm.json?_st=26096170",
    # Red China - Chinese Spirit Research
    "wusijingshen": "https://www.xuexi.cn/lgdata/1ibpde47i7l.json?_st=26096176",
    "hongchuanjingshen": "https://www.xuexi.cn/lgdata/ue2lvnpl7l.json?_st=26096176",
    "jinggangshanjingshen": "https://www.xuexi.cn/lgdata/uo3b9nde7l.json?_st=26096177",
    "changzhengjingshen": "https://www.xuexi.cn/lgdata/136cgvrgp7l.json?_st=26096178",
    "yananjingshen": "https://www.xuexi.cn/lgdata/1ebgr501e7l.json?_st=26096179",
    "taihangjingshen": "https://www.xuexi.cn/lgdata/10444suc57l.json?_st=26096179",
    "yimengjingshen": "https://www.xuexi.cn/lgdata/1nf7dv27p7l.json?_st=26096180",
    "xibaipojingshen": "https://www.xuexi.cn/lgdata/1gqebjagq7l.json?_st=26096181",
    "tierenjingshen": "https://www.xuexi.cn/lgdata/1ganajvkg7l.json?_st=26096181",
    "jiaoyulujingshen": "https://www.xuexi.cn/lgdata/16td1roi77l.json?_st=26096182",
    "liangdanyixingjingshen": "https://www.xuexi.cn/lgdata/17ictec57nl.json?_st=26096182",
    "hansaibajingshen": "https://www.xuexi.cn/lgdata/11fksd93o7l.json?_st=26096183",
    "gaigekaifangjingshen": "https://www.xuexi.cn/lgdata/2qvbqhmdrube.json?_st=26096184",
    # Study Science
    "kejisixiangyanjiu": "https://www.xuexi.cn/lgdata/1eppcq11fne.json?_st=26096190",
    "kexuejingshentan": "https://www.xuexi.cn/lgdata/1armpdlt5ne.json?_st=26096194",
    "guojiagongcheng": "https://www.xuexi.cn/lgdata/1moa0khf17e.json?_st=26096195",
    "5G": "https://www.xuexi.cn/lgdata/56j1nv2difvo.json?_st=26096196",
    "kejiqianyan": "https://www.xuexi.cn/lgdata/152ijthp37e.json?_st=26096196",
    "dangdaikexuejiagushi": "https://www.xuexi.cn/lgdata/11jihrmq37e.json?_st=26096198",
    "yixianfengcai": "https://www.xuexi.cn/lgdata/1cieuomejnn.json?_st=26096198",
    "zhengcejiedu": "https://www.xuexi.cn/lgdata/1lje05c9une.json?_st=26096200",
    "kepuzhishi": "https://www.xuexi.cn/lgdata/1drofao4h7e.json?_st=26096205",
    "zhongguokejishi": "https://www.xuexi.cn/lgdata/110jqimatnn.json?_st=26096206",
    "zhongguolidaikexuejia": "https://www.xuexi.cn/lgdata/153hr7eadnn.json?_st=26096206",
    "waiguokexuejia": "https://www.xuexi.cn/lgdata/14ddfon4e7n.json?_st=26096207",
    "shijiekejishi": "https://www.xuexi.cn/lgdata/14gko3bjk7n.json?_st=26096208",
    # Scientific works are PDF text, which is awkward to store in the
    # database once crawled; excluded for now
    # "kexuezhuzuo": "https://www.xuexi.cn/lgdata/u6i3lnss7e.json?_st=26096210",
    "xinlifudao": "https://www.xuexi.cn/lgdata/1h4s6pojfne.json?_st=26096211",
    # Global View
    "xijinpingwaijiaosixiang": "https://www.xuexi.cn/lgdata/1ooaa665snf.json?_st=26096213",
    "shijieyanzhongdexijinping": "https://www.xuexi.cn/lgdata/vdppiu92n1.json?_st=26096214",
    # Global View - Belt and Road
    "xinwenzixun": "https://www.xuexi.cn/lgdata/1kok79h5s7n.json?_st=26096216",
    "zhengcehuanjing": "https://www.xuexi.cn/lgdata/1mjdmg8mtnn.json?_st=26096216",
    "hulianhutong": "https://www.xuexi.cn/lgdata/1kb4calll7n.json?_st=26096217",
    "guojihezuo": "https://www.xuexi.cn/lgdata/t1u2cdg6nn.json?_st=26096218",
    "jicushuju": "https://www.xuexi.cn/lgdata/1dqdq0hj07n.json?_st=26096218",
    "gonghuasilu": "https://www.xuexi.cn/lgdata/1kmjuu09c7n.json?_st=26096219",
    "wenboyaniu": "https://www.xuexi.cn/lgdata/1063lvdd6nd.json?_st=26096222",
    # Videos; excluded for now
    # "wenbogongkiake": "https://www.xuexi.cn/lgdata/1o2r10b6f7d.json?_st=26096223",
    # "wenbojilupian": "https://www.xuexi.cn/lgdata/17j565ghcnd.json?_st=26096224",
    # Xi Jinping Collected Works: column content is rather mixed; excluded for now
    # Study TV and Study MOOC are all videos; excluded for now
    # Study Culture: many columns; crawl architecture, martial arts, couplets
    # and medicine first, add the rest once the basic functionality is in place
    # Study Culture - Chinese Architecture: the "Architecture and Culture" and
    # "Architecture Knowledge" columns are not generated from JSON but live in
    # data+MD5; excluded for now
    "gudaijianzhujicui": "https://www.xuexi.cn/lgdata/v4pq4uth7d.json?_st=26096242",
    "jindaijianzhujicui": "https://www.xuexi.cn/lgdata/1c59olfnvnd.json?_st=26096243",
    "jianshewenhuayujianzhuyanjiu": "https://www.xuexi.cn/lgdata/1oog782287d.json?_st=26096246",
    "jianzhumingjia": "https://www.xuexi.cn/lgdata/tnigd8qund.json?_st=26096248",
    # Study Culture - Chinese Martial Arts; the teaching column is video,
    # excluded for now
    "wushuyanbian": "https://www.xuexi.cn/lgdata/1lnc6c84mnd.json?_st=26096250",
    "zhonghuashangwujingshen": "https://www.xuexi.cn/lgdata/11j85c92mnd.json?_st=26096252",
    "zhonghuawuhun": "https://www.xuexi.cn/lgdata/13d9au2i0nd.json?_st=26096253",
    "zhonghuawuxue": "https://www.xuexi.cn/lgdata/vb0qh7so7d.json?_st=26096254",
    # Study Culture - Chinese Medicine
    "zhongyidianji": "https://www.xuexi.cn/lgdata/urm3g97vnn.json?_st=26096257",
    "lidaimingyi": "https://www.xuexi.cn/lgdata/168delc1d7e.json?_st=26096259",
    "dangdaimingyi": "https://www.xuexi.cn/lgdata/18r1mt5nh7e.json?_st=26096260",
    "zhonghuayiyaoyushijie": "https://www.xuexi.cn/lgdata/15q5l76icne.json?_st=26096262",
    # Study Culture - Chinese Couplets; couplet videos excluded for now
    "minglianjianshang": "https://www.xuexi.cn/lgdata/1i276vso3ne.json?_st=26096267",
    # Study Culture - Chinese Couplets - Couplets and Customs - Couplet Customs
    "chunlianxisu": "https://www.xuexi.cn/lgdata/4j8eurk302bq.json?_st=26096268",
    # Study Culture - Chinese Couplets - Couplets and Customs - Seasonal Couplets
    "jielingyinglian": "https://www.xuexi.cn/lgdata/4b0a3rjqb9uq.json?_st=26096270",
    # Study Culture - Chinese Couplets - Couplets and Customs - Trade Couplets
    "hangyeyinglian": "https://www.xuexi.cn/lgdata/48oc8va2veoh.json?_st=26096271",
    # Study Culture - Chinese Couplets - Couplets and Customs - Celebration Couplets
    "xiqingyinglian": "https://www.xuexi.cn/lgdata/4qkgon8lvjdj.json?_st=26096272",
    # Study Culture - Chinese Couplets - Couplets and Customs - Religious Couplets
    "zongjiaoyinglian": "https://www.xuexi.cn/lgdata/3m1efumdciph.json?_st=26096273",
    # Study Culture - Chinese Couplets - Couplets and Customs - Other Couplets
    "qitayinglian": "https://www.xuexi.cn/lgdata/4a3hb4kkg5v4.json?_st=26096275",
    # Study Culture - Chinese Couplets - Couplet Knowledge
    "yinglianzhishi": "https://www.xuexi.cn/lgdata/10jabihga7e.json?_st=26096277",
    # Strong Military: 1. Research on Xi Jinping's thought on strengthening
    # the military 2. Military commentary 3. Military history: strategists,
    # stories, archives, relics 4. Ancient military strategists 5. Ancient
    # weapons 6. Weapons overview
    # Most of the rest are pictures and videos; excluded for now
    "xijinpingqiangjunsixiangyanjiu": "https://www.xuexi.cn/lgdata/12lm260c37e.json?_st=26096287",
    "qiangjushiping": "https://www.xuexi.cn/lgdata/1j2fuv9rs7e.json?_st=26096378",
    "junshijia": "https://www.xuexi.cn/lgdata/16ap1a07pnn.json?_st=26096382",
    "junshigushi": "https://www.xuexi.cn/lgdata/178c76irs7n.json?_st=26096383",
    "junshidangan": "https://www.xuexi.cn/lgdata/1nan78i05nn.json?_st=26096384",
    "junshirenwu": "https://www.xuexi.cn/lgdata/12l486vm97n.json?_st=26096385",
    "gudaijunshijia": "https://www.xuexi.cn/lgdata/1mo5h6vk07f.json?_st=26096387",
    "gudaibingqi": "https://www.xuexi.cn/lgdata/ug4o30g07f.json?_st=26096390",
    "bingqidaguan": "https://www.xuexi.cn/lgdata/1gd1n2n667f.json?_st=26096445",
    # Beautiful China: 1. Research on ecological-civilization thought
    # 2. Ecological-civilization practice 3. Travelling across China
    # 4. Remembering one's roots 5. Historic cultural cities
    # 6. Historic cultural towns 7. Historic cultural streets
    "shengtaiwenmingjianshesixiangyanjiu": "https://www.xuexi.cn/lgdata/1ahi87vjg7e.json?_st=26096452",
    "shengtaiwenmingjiansheshijian": "https://www.xuexi.cn/lgdata/1eiarm6b5ne.json?_st=26096462",
    "zoubianzhongguo": "https://www.xuexi.cn/lgdata/3alk4pkja8hl.json?_st=26096463",
    "jizhuxiangchou": "https://www.xuexi.cn/lgdata/dfo6qrb8n1p.json?_st=26096457",
    "lishiwenhuamingcheng": "https://www.xuexi.cn/lgdata/12k7uv48b7e.json?_st=26096458",
    "lishiwenhuamingzhen": "https://www.xuexi.cn/lgdata/1ogcjrb5c7e.json?_st=26096466",
    "lishiwenhuamingjie": "https://www.xuexi.cn/lgdata/1b04bc5p5ne.json?_st=26096467",
}

# Matches a direct item link and captures its item_id. The URL part is
# non-greedy and may not contain whitespace or a double quote, so on compact
# (whitespace-free) JSON a match cannot run across several string values and
# swallow every id but the last one.
_ITEM_LINK_RE = re.compile(r'https?://[^\s"]+?&item_id=(\d+)')


# Core entry point: collect every item_id from the JSON feeds.
def main(json_urls=None, fetch=None, save=None):
    """Fetch each feed, extract the item ids it links to and store them.

    For every ``label -> url`` pair the feed text is downloaded, all links of
    the form ``http(s)://...&item_id=<digits>`` are located, the ids are
    de-duplicated (first-seen order) and one document
    ``{"_id": item_id, "label": label, "item_id": item_id}`` is stored per id.

    Args:
        json_urls: Mapping of label to feed URL. Defaults to ``JSON_URLS``.
        fetch: Callable invoked as ``fetch(url=...)`` that returns the feed
            body. Defaults to ``get_item_id``.
        save: Callable invoked with each document. Defaults to
            ``xuexiqiangguo_item_id.insertOne``.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Defaults are resolved lazily so the module-level names are only needed
    # when the caller does not supply replacements.
    if json_urls is None:
        json_urls = JSON_URLS
    if fetch is None:
        fetch = get_item_id
    if save is None:
        save = xuexiqiangguo_item_id.insertOne

    for label, url in json_urls.items():
        try:
            text = str(fetch(url=url))
        except Exception as e:
            # One unreachable feed must not abort the remaining labels.
            print(f"Label: {label}, fetch failed: {e}")
            continue

        # Only the id is stored, so de-duplicate on the id itself rather than
        # on (link, id) pairs; dict.fromkeys keeps first-seen order.
        item_ids = list(dict.fromkeys(_ITEM_LINK_RE.findall(text)))
        print(f"Label: {label}, len Links with item_id: {len(item_ids)}")

        for item_id in item_ids:
            try:
                save({"_id": item_id, 'label': label, 'item_id': item_id})
            except Exception as e:
                # Best effort: report and keep going. Presumably this is mostly
                # a duplicate "_id" (an id already stored) -- verify against
                # the mongo_manager implementation.
                print(e)

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    print()  # emit a blank line before the per-label progress output
    main()